# File:	    1_SimulatedData.R
# Project:	Colonial legacy and foreign aid: Decomposing the colonial bias
# Authors:	Daina Chiba, Department of Government, University of Essex
#           Tobias Heinrich, Department of Political Science, University of South Carolina
# Contact:	dchiba@essex.ac.uk
# Date:     19 November, 2018

# Description: 
# This R script file replicates the results reported in Tables 1 and A.1, and Figure 1 of the article. 

rm(list=ls())
library(stargazer)


# Functions and data ------------------------------------------------------

dcmp.sim <- function(data){
  # Split the data into two groups
  data.0 <- data[data $ colony == 0, ]
  data.1 <- data[data $ colony == 1, ]
  # Estimate models for each group
  fit.0 <- lm(y ~ x + x2, data = data.0)
  fit.1 <- lm(y ~ x + x2, data = data.1)
  beta.0 <- coef(fit.0)
  beta.1 <- coef(fit.1)
  xvars <- names(beta.0)
  xvars <- xvars[-1]  ## omit the intercept
  x <- cbind(1, as.matrix(data[,xvars])) ## bring back the intercept
  x.0 <- x[data $ colony == 0, ]
  x.1 <- x[data $ colony == 1, ]
  # XBs
  ly.x0.b0 <- mean(x.0 %*% beta.0)
  ly.x0.b1 <- mean(x.0 %*% beta.1)
  ly.x1.b0 <- mean(x.1 %*% beta.0)
  ly.x1.b1 <- mean(x.1 %*% beta.1)
  
  total <- ly.x1.b1 - ly.x0.b0 # Total difference
  obs <- ly.x1.b0 - ly.x0.b0   # Difference attributable to observables
  cat("% Observables   ")
  print(100 * obs/total)
  cat("% Saliency      ")
  print(100 * (total-obs)/total)  
}

# Functions for DGP
b.1 <- function(x) -(x - 5)^2 + 10
b.0 <- function(x) -0.5*(x - 5)^2 + 5
b.n <- function(x) -3*(x - 5)^2 + 10

# Number of observations in simulation
n.sim <- 200

# Generate error terms
set.seed(12345)
e1 <- rnorm(n.sim) * 1
e2 <- rnorm(n.sim) * 1



# Case 1: 0 observables ---------------------------------------------------

## X
set.seed(1234)
x.0 <- rnorm(n.sim, mean = 5, sd = 1)

## Y
y.0 <- b.0(x.0) + e1 + 3
y.1 <- b.1(x.0) + e2 + 7

mean(y.1)
mean(y.0)

data.0 <- data.frame(y = y.0, x = x.0, x2 = x.0^2, colony = 0)
data.1 <- data.frame(y = y.1, x = x.0, x2 = x.0^2, colony = 1)
data.obs.0 <- rbind(data.0, data.1)

## Decomposition
dcmp.sim(data.obs.0)


# Case 3: low observables -------------------------------------------------

set.seed(0)
x.1 <- rnorm(n.sim, mean = 5, sd = 1.5)
x.0 <- c(rnorm(n.sim/2, mean = 3, sd = 1), rnorm(n.sim/2, mean = 7, sd = 1))

y.0 <- b.0(x.0) + e1 + 5
y.1 <- b.1(x.1) + e2 + 7.5
mean(y.1) - mean(y.0)

data.0 <- data.frame(y = y.0, x = x.0, x2 = x.0^2, colony = 0)
data.1 <- data.frame(y = y.1, x = x.1, x2 = x.1^2, colony = 1)
data.obs.25 <- rbind(data.0, data.1)

## Decomposition
dcmp.sim(data.obs.25)


# Case 4: high observables ------------------------------------------------

b.n <- function(x) -0.7*(x - 5)^2 + 7

set.seed(0)
x.1 <- rnorm(n.sim, mean = 5, sd = 1.5)
x.0 <- c(rnorm(n.sim/2, mean = 2, sd = 1), rnorm(n.sim/2, mean = 8, sd = 1))

y.0 <- b.n(x.0) + e1 + 5
y.1 <- b.1(x.1) + e2 + 5
mean(y.1) - mean(y.0)

data.0 <- data.frame(y = y.0, x = x.0, x2 = x.0^2, colony = 0)
data.1 <- data.frame(y = y.1, x = x.1, x2 = x.1^2, colony = 1)
data.obs.75 <- rbind(data.0, data.1)

## Decomposition
dcmp.sim(data.obs.75)


# Case 2: 100% observables ------------------------------------------------
set.seed(123)
x.1 <- rnorm(n.sim, mean = 5, sd = 1.5)
x.0 <- c(rnorm(n.sim/2, mean = 2, sd = 1), rnorm(n.sim/2, mean = 8, sd = 1))

b.1a <- function(x) -(x - 5)^2 + 10
b.1b <- function(x) -0.8*(x - 5)^2 + 10

y.0 <- b.1a(x.0) + e1 + 5
y.1 <- b.1b(x.1) + e2 + 5
mean(y.0)
mean(y.1)

mean(x.0)
mean(x.1)
var(x.0)
var(x.1)

data.0 <- data.frame(y = y.0, x = x.0, x2 = x.0^2, colony = 0)
data.1 <- data.frame(y = y.1, x = x.1, x2 = x.1^2, colony = 1)
data.obs.100 <- rbind(data.0, data.1)

## Decomposition
dcmp.sim(data.obs.100)

fit.100 <- lm(y ~ x + I(x^2) + colony, data = data.obs.100)
stargazer(fit.100, type="text")


# Negative observable -----------------------------------------------------
set.seed(1234)
x.0 <- rnorm(n.sim, mean = 5, sd = 1.5)
x.1 <- c(rnorm(n.sim/2, mean = 3, sd = 1), rnorm(n.sim/2, mean = 7, sd = 1))

## Y
y.0 <- b.0(x.0) + e1 + 2
y.1 <- b.n(x.1) + e2 + 10

mean(y.1)
mean(y.0)

data.0 <- data.frame(y = y.0, x = x.0, x2 = x.0^2, colony = 0)
data.1 <- data.frame(y = y.1, x = x.1, x2 = x.1^2, colony = 1)
data.obs.n <- rbind(data.0, data.1)

## Decomposition
dcmp.sim(data.obs.n)




# Plot --------------------------------------------------------------------

# Figure 1: Synthetic data

pdf(file="output/figures/Figure1-simdata.pdf",width=11, height=8)
par(mfrow=c(2,3))
## Case 1 (old 4): Zero
plot(data.obs.0 $ x, data.obs.0 $ y, 
     col = ifelse(data.obs.0 $ colony == 0, "gray", "black"), 
     pch = ifelse(data.obs.0 $ colony == 0, 1, 3),
     xlab = "Resources", ylab = "Aid", 
     main = "Case 1", cex.lab = 1.2, cex.main = 1.7,
     xlim = c(0, 10), ylim = c(0, 20))
rug(data.obs.0 $ x[data.obs.0 $ colony==0], side = 1, col = "gray")
rug(data.obs.0 $ y[data.obs.0 $ colony==0], side = 4, col = "gray")
rug(data.obs.0 $ y[data.obs.0 $ colony==1], side = 2, col = "black")
rug(data.obs.0 $ x[data.obs.0 $ colony==1], side = 3, col = "black")
## fitted lines
lm1 <- coef(lm(y ~ x + I(x^2), data = data.obs.0, subset = data.obs.0 $ colony==1))
lm0 <- coef(lm(y ~ x + I(x^2), data = data.obs.0, subset = data.obs.0 $ colony==0))
f1 <- function(x) lm1[1] + lm1[2] * x + lm1[3] * x^2
f0 <- function(x) lm0[1] + lm0[2] * x + lm0[3] * x^2
plot(f1, xlim = c(2.5, 7.5), add=T)
plot(f0, xlim = c(1.5, 8.5), add=T, lty=2, lwd=2)
## Case 2 (old 2): 100% observable
plot(data.obs.100 $ x, data.obs.100 $ y,
     col = ifelse(data.obs.100 $ colony == 0, "gray", "black"), 
     pch = ifelse(data.obs.100 $ colony == 0, 1, 3),
     xlab = "Resources", ylab = "Aid", 
     main = "Case 2", cex.lab = 1.2, cex.main = 1.5,
     xlim = c(0, 10), ylim = c(0, 20))
rug(data.obs.100 $ x[data.obs.100 $ colony==0], side = 1, col = "gray")
rug(data.obs.100 $ y[data.obs.100 $ colony==0], side = 4, col = "gray")
rug(data.obs.100 $ y[data.obs.100 $ colony==1], side = 2, col = "black")
rug(data.obs.100 $ x[data.obs.100 $ colony==1], side = 3, col = "black")
## fitted lines
lm1 <- coef(lm(y ~ x + I(x^2), data = data.obs.100, subset = data.obs.100 $ colony==1))
lm0 <- coef(lm(y ~ x + I(x^2), data = data.obs.100, subset = data.obs.100 $ colony==0))
plot(f1, xlim = c(1.5, 8.5), add=T)
plot(f0, xlim = c(1.5, 8.5), add=T, lty=2, lwd=2)
## blank
plot(data.obs.n $ x, data.obs.n $ y, axes=F, type="n", xlab="", ylab="")
legend(x="center", y= "center", inset=0.1,
            legend = c("Colonial", "Non-colonial"),
            col = c("black", "gray"),
            pch = c(3, 1), cex=2,
            bg = "white")
## Case 3: 20% observable
plot(data.obs.25 $ x, data.obs.25 $ y,
     col = ifelse(data.obs.25 $ colony == 0, "gray", "black"), 
     pch = ifelse(data.obs.25 $ colony == 0, 1, 3),
     xlab = "Resources", ylab = "Aid", 
     main = "Case 3", cex.lab = 1.2, cex.main = 1.7,
     xlim = c(0, 10), ylim = c(0, 20))
rug(data.obs.25 $ x[data.obs.25 $ colony==0], side = 1, col = "gray")
rug(data.obs.25 $ y[data.obs.25 $ colony==0], side = 4, col = "gray")
rug(data.obs.25 $ y[data.obs.25 $ colony==1], side = 2, col = "black")
rug(data.obs.25 $ x[data.obs.25 $ colony==1], side = 3, col = "black")
## fitted lines
lm1 <- coef(lm(y ~ x + I(x^2), data = data.obs.25, subset = data.obs.25 $ colony==1))
lm0 <- coef(lm(y ~ x + I(x^2), data = data.obs.25, subset = data.obs.25 $ colony==0))
plot(f1, xlim = c(1.5, 8.5), add=T)
plot(f0, xlim = c(1.5, 8.5), add=T, lty=2, lwd=2)
## Case 4 (old 2): 75% observable
plot(data.obs.75 $ x, data.obs.75 $ y,
     col = ifelse(data.obs.75 $ colony == 0, "gray", "black"), 
     pch = ifelse(data.obs.75 $ colony == 0, 1, 3),
     xlab = "Resources", ylab = "Aid", 
     main = "Case 4", cex.lab = 1.2, cex.main = 1.7,
     xlim = c(0, 10), ylim = c(0, 20))
rug(data.obs.75 $ x[data.obs.75 $ colony==0], side = 1, col = "gray")
rug(data.obs.75 $ y[data.obs.75 $ colony==0], side = 4, col = "gray")
rug(data.obs.75 $ y[data.obs.75 $ colony==1], side = 2, col = "black")
rug(data.obs.75 $ x[data.obs.75 $ colony==1], side = 3, col = "black")
## fitted lines
lm1 <- coef(lm(y ~ x + I(x^2), data = data.obs.75, subset = data.obs.75 $ colony==1))
lm0 <- coef(lm(y ~ x + I(x^2), data = data.obs.75, subset = data.obs.75 $ colony==0))
plot(f1, xlim = c(1.5, 8.5), add=T)
plot(f0, xlim = c(1.5, 8.5), add=T, lty=2, lwd=2)
## Case 5: Negative
plot(data.obs.n $ x, data.obs.n $ y, 
     col = ifelse(data.obs.n $ colony == 0, "gray", "black"), 
     pch = ifelse(data.obs.n $ colony == 0, 1, 3),
     xlab = "Resources", ylab = "Aid", 
     main = "Case 5", cex.lab = 1.2, cex.main = 1.7,
     xlim = c(0, 10), ylim = c(0, 20))
rug(data.obs.n $ x[data.obs.n $ colony==0], side = 1, col = "gray")
rug(data.obs.n $ y[data.obs.n $ colony==0], side = 4, col = "gray")
rug(data.obs.n $ y[data.obs.n $ colony==1], side = 2, col = "black")
rug(data.obs.n $ x[data.obs.n $ colony==1], side = 3, col = "black")
## fitted lines
lm1 <- coef(lm(y ~ x + I(x^2), data = data.obs.n, subset = data.obs.n $ colony==1))
lm0 <- coef(lm(y ~ x + I(x^2), data = data.obs.n, subset = data.obs.n $ colony==0))
plot(f1, xlim = c(2.5, 7.5), add=T)
plot(f0, xlim = c(1.5, 8.5), add=T, lty=2, lwd=2)
dev.off()


# Regression --------------------------------------------------------------

# Regression models reported in Table 1
fit.n <- lm(y ~ colony + x + I(x^2), data = data.obs.n)
fit.0 <- update(fit.n, data = data.obs.0)
fit.25 <- update(fit.n, data = data.obs.25)
fit.75 <- update(fit.n, data = data.obs.75)
fit.100 <- update(fit.n, data = data.obs.100)

# Table 1
stargazer(fit.0, fit.100, fit.25, fit.75, fit.n,
          type = "text", omit.stat = c("f", "ser"), digits=2,
  title = "Table 1: Regression results of the synthetic data")

# latex output
stargazer(fit.0, fit.100, fit.25, fit.75, fit.n,
          type = "latex", omit.stat = c("f", "ser"), digits=2)

# Decomposition results shown at the bottom of Table 1
dcmp.sim(data.obs.0)   # Case 1
dcmp.sim(data.obs.100) # Case 2
dcmp.sim(data.obs.25)  # Case 3
dcmp.sim(data.obs.75)  # Case 4
dcmp.sim(data.obs.n)   # Case 5


# Output these into a file
sink(file = "output/tables/table1.txt")
stargazer(fit.0, fit.100, fit.25, fit.75, fit.n,
          type = "text", omit.stat = c("f", "ser"), digits=2,
          title = "Table 1: Regression results of the synthetic data")
cat("\n")
print("Decomposition for Case 1")
dcmp.sim(data.obs.0); cat("\n")
print("Decomposition for Case 2")
dcmp.sim(data.obs.100); cat("\n")
print("Decomposition for Case 3")
dcmp.sim(data.obs.25); cat("\n")
print("Decomposition for Case 4")
dcmp.sim(data.obs.75); cat("\n")
print("Decomposition for Case 5")
dcmp.sim(data.obs.n)
sink()



# Regression models reported in Table A.1 in the Appendix

fit.n.c <- lm(y ~ x + I(x^2), data = data.obs.n[data.obs.n $ colony == 1,])
fit.n.n <- update(fit.n.c, data = data.obs.n[data.obs.n $ colony == 0,])
fit.0.c <- update(fit.n.c, data = data.obs.0[data.obs.0 $ colony == 1,])
fit.0.n <- update(fit.n.c, data = data.obs.0[data.obs.0 $ colony == 0,])
fit.25.c <- update(fit.n.c, data = data.obs.25[data.obs.25 $ colony == 1,])
fit.25.n <- update(fit.n.c, data = data.obs.25[data.obs.25 $ colony == 0,])
fit.75.c <- update(fit.n.c, data = data.obs.75[data.obs.75 $ colony == 1,])
fit.75.n <- update(fit.n.c, data = data.obs.75[data.obs.75 $ colony == 0,])
fit.100.c <- update(fit.n.c, data = data.obs.100[data.obs.100 $ colony == 1,])
fit.100.n <- update(fit.n.c, data = data.obs.100[data.obs.100 $ colony == 0,])


stargazer(
  fit.0.c, fit.0.n, 
  fit.100.c, fit.100.n, 
  fit.25.c, fit.25.n, 
  fit.75.c, fit.75.n, 
  fit.n.c, fit.n.n, 
  type = "text", omit.stat = c("f", "ser"), digits=2,
  title = "Table A.1: Regression results of the simulated data by sample")


# Output this into a file
stargazer(
  fit.0.c, fit.0.n, 
  fit.100.c, fit.100.n, 
  fit.25.c, fit.25.n, 
  fit.75.c, fit.75.n, 
  fit.n.c, fit.n.n, 
  type = "text", omit.stat = c("f", "ser"), digits=2,
  title = "Table A.1: Regression results of the simulated data by sample",
  out = "output/tables/tableA1.txt")





# End of file
